{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ConvNet-LSTM Stack Sentiment Classifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this notebook, we stack a bidirectional LSTM on top of a one-dimensional convolutional layer to classify IMDB movie reviews by their sentiment."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/the-deep-learners/deep-learning-illustrated/blob/master/notebooks/conv_lstm_stack_sentiment_classifier.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import keras\n",
    "from keras.datasets import imdb\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Dropout, Embedding, SpatialDropout1D, LSTM\n",
    "from keras.layers.wrappers import Bidirectional \n",
    "from keras.layers import Conv1D, MaxPooling1D \n",
    "from keras.callbacks import ModelCheckpoint\n",
    "import os\n",
    "from sklearn.metrics import roc_auc_score \n",
    "import matplotlib.pyplot as plt \n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Set hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# where the per-epoch checkpoints are written:\n",
    "output_dir = 'model_output/cnnLSTM'\n",
    "\n",
    "# training schedule:\n",
    "epochs = 4\n",
    "batch_size = 128\n",
    "\n",
    "# vector-space embedding:\n",
    "n_dim = 64  # width of each word vector\n",
    "n_unique_words = 10000  # vocabulary size used by the data loader and the Embedding layer\n",
    "max_review_length = 200  # every review is padded/truncated to this many tokens\n",
    "pad_type = 'pre'  # short reviews are padded at the start\n",
    "trunc_type = 'pre'  # long reviews lose tokens from the start\n",
    "drop_embed = 0.2  # SpatialDropout1D rate applied to the embeddings\n",
    "\n",
    "# convolutional layer architecture:\n",
    "n_conv = 64  # number of Conv1D filters\n",
    "k_conv = 3  # Conv1D kernel length\n",
    "mp_size = 4  # MaxPooling1D pool size\n",
    "\n",
    "# LSTM layer architecture:\n",
    "n_lstm = 64  # units per direction of the bidirectional LSTM\n",
    "drop_lstm = 0.2  # dropout rate passed to the LSTM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the vocabulary is capped at n_unique_words when the reviews are loaded\n",
    "train_split, valid_split = imdb.load_data(num_words=n_unique_words)\n",
    "x_train, y_train = train_split\n",
    "x_valid, y_valid = valid_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# both splits must be padded/truncated identically, so share one set of arguments\n",
    "pad_kwargs = dict(maxlen=max_review_length, padding=pad_type, truncating=trunc_type, value=0)\n",
    "x_train = pad_sequences(x_train, **pad_kwargs)\n",
    "x_valid = pad_sequences(x_valid, **pad_kwargs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Design neural network architecture"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# embedding -> conv/pool feature extractor -> bidirectional LSTM -> sigmoid output\n",
    "model = Sequential([\n",
    "    Embedding(n_unique_words, n_dim, input_length=max_review_length),\n",
    "    SpatialDropout1D(drop_embed),\n",
    "    Conv1D(n_conv, k_conv, activation='relu'),\n",
    "    MaxPooling1D(mp_size),  # shortens the sequence before it reaches the LSTM\n",
    "    Bidirectional(LSTM(n_lstm, dropout=drop_lstm)),\n",
    "    Dense(1, activation='sigmoid'),  # single probability for the binary label\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_1 (Embedding)      (None, 200, 64)           640000    \n",
      "_________________________________________________________________\n",
      "spatial_dropout1d_1 (Spatial (None, 200, 64)           0         \n",
      "_________________________________________________________________\n",
      "conv1d_1 (Conv1D)            (None, 198, 64)           12352     \n",
      "_________________________________________________________________\n",
      "max_pooling1d_1 (MaxPooling1 (None, 49, 64)            0         \n",
      "_________________________________________________________________\n",
      "bidirectional_1 (Bidirection (None, 128)               66048     \n",
      "_________________________________________________________________\n",
      "dense_1 (Dense)              (None, 1)                 129       \n",
      "=================================================================\n",
      "Total params: 718,529\n",
      "Trainable params: 718,529\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# the Bidirectional wrapper holds one LSTM per reading direction, so its parameter count is doubled\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Configure model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(\n",
    "    loss='binary_crossentropy',  # pairs with the single sigmoid output unit\n",
    "    optimizer='adam',\n",
    "    metrics=['accuracy'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make sure the checkpoint directory exists before training writes to it;\n",
    "# exist_ok=True replaces the separate existence check and keeps this cell safe to re-run\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# saves a checkpoint after every epoch, with the epoch number in the file name\n",
    "modelcheckpoint = ModelCheckpoint(filepath=output_dir+\"/weights.{epoch:02d}.hdf5\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Train!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 25000 samples, validate on 25000 samples\n",
      "Epoch 1/4\n",
      "25000/25000 [==============================] - 17s 680us/step - loss: 0.4530 - acc: 0.7620 - val_loss: 0.3147 - val_acc: 0.8705\n",
      "Epoch 2/4\n",
      "25000/25000 [==============================] - 16s 627us/step - loss: 0.2436 - acc: 0.9032 - val_loss: 0.3291 - val_acc: 0.8710\n",
      "Epoch 3/4\n",
      "25000/25000 [==============================] - 16s 625us/step - loss: 0.1734 - acc: 0.9356 - val_loss: 0.3313 - val_acc: 0.8587\n",
      "Epoch 4/4\n",
      "25000/25000 [==============================] - 16s 621us/step - loss: 0.1307 - acc: 0.9525 - val_loss: 0.3380 - val_acc: 0.8532\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7f2ffdd10e80>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(\n",
    "    x_train, y_train,\n",
    "    batch_size=batch_size,\n",
    "    epochs=epochs,\n",
    "    verbose=1,\n",
    "    validation_data=(x_valid, y_valid),\n",
    "    callbacks=[modelcheckpoint],  # writes a checkpoint into output_dir after each epoch\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "#### Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# restore the epoch-2 checkpoint: it had the highest validation accuracy in the recorded run;\n",
    "# training is not seeded, so the best epoch may differ when the notebook is re-run\n",
    "weights_path = output_dir+\"/weights.02.hdf5\"\n",
    "model.load_weights(weights_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# predict() returns the sigmoid probabilities directly; predict_proba() is deprecated and absent from current Keras releases\n",
    "y_hat = model.predict(x_valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEPxJREFUeJzt3X+snmV9x/H3Ryr+RkCKYS3bwVidSLLIGsSZOGcNFDCUP8DUzFFJsyaOOefMNtyWdAFJcL+YJorrpLMYJzBmRiM4wvgRt0WQgzgmMEIHDM5gcrQF3Yg/qt/98VzFA9dp+3Ce0/P0tO9XcvLc93Vf931/r57Tfs79s6kqJEma6QXjLkCStP8xHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktRZMu4C5uqoo46qiYmJcZchPdt37x98Hvb68dYhzeLOO+/8dlUtHabvog2HiYkJJicnx12G9Gz/9PbB5ztvHWcV0qyS/NewfT2tJEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnq7PUJ6SSbgXcBT1TVCa3tSOAqYAJ4GHh3Ve1IEuDjwOnA08D7qurrbZ11wB+1zX60qra09l8EPgu8BLge+GBV1TyNb1YTF1y3Lze/Ww9fcsZY9itJz9cwRw6fBVY/p+0C4KaqWgHc1OYBTgNWtK8NwGXwTJhsBN4MnARsTHJEW+ey1nfXes/dlyRpge01HKrqK8D25zSvAba06S3AWTPar6iB24DDkxwDnArcWFXbq2oHcCOwui07rKq+2o4WrpixLUnSmMz1msOrq+pxgPZ5dGtfBjw6o99Ua9tT+9Qs7ZKkMZrvC9KZpa3m0D77xpMNSSaTTE5PT8+xREnS3sw1HL7VTgnRPp9o7VPAsTP6LQce20v78lnaZ1VVm6pqZVWtXLp0qFeSS5LmYK7hsBVY16bXAdfOaD83AycDT7XTTjcApyQ5ol2IPgW4oS37XpKT251O587YliRpTIa5lfULwNuBo5JMMbjr6BLg6iTrgUeAc1r36xncxrqNwa2s5wFU1fYkFwF3tH4XVtWui9zv56e3sn65fUmSxmiv4VBV79nNolWz9C3g/N1sZzOweZb2SeCEvdUhSVo4PiEtSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkzl7/m1BJUm/iguvGst+HLzljQfbjkYMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqTNSOCT5UJJ7knwzyReSvDjJcUluT/JAkquSHNr6vqjNb2vLJ2Zs5yOt/f4kp442JEnSqOYcDkmWAb8FrKyqE4BDgLXAx4BLq2oFsANY31ZZD+yoqtcCl7Z+JDm+rfdGYDXwqSSHzLUuSdLoRj2ttAR4SZIlwEuBx4F3ANe05VuAs9r0mjZPW74qSVr7lVX1g6p6CNgGnDRiXZKkEcw5HKrqv4E/Ax5hEApPAXcCT1bVztZtCljWppcBj7Z1d7b+r5rZPss6z5JkQ5LJJJPT09NzLV2StBejnFY6gsFv/ccBPwO8DDhtlq61a5XdLNtde99YtamqVlbVyqVLlz7/oiVJQxnltNI7gYeqarqqfgR8Efgl4PB2mglgOfBYm54CjgVoy18JbJ/ZPss6kqQxGCUcHgFOTvLSdu1gFXAvcAtwduuzDri2TW9t87TlN1dVtfa17W6m44AVwNdGqEuSNKIle+8yu6q6Pck1wNeBncBdwCbgOuDKJB9tbZe3VS4HPpdkG4MjhrVtO/ckuZpBsOwEzq+qH8+1LknS6OYcDgBVtRHY+JzmB5nlbqOq+j5wzm62czFw8Si1SJLmj09IS5I6hoMk
qWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqWM4SJI6I4VDksOTXJPkP5Lcl+QtSY5McmOSB9rnEa1vknwiybYkdyc5ccZ21rX+DyRZN+qgJEmjGfXI4ePAP1bVzwO/ANwHXADcVFUrgJvaPMBpwIr2tQG4DCDJkcBG4M3AScDGXYEiSRqPOYdDksOAtwGXA1TVD6vqSWANsKV12wKc1abXAFfUwG3A4UmOAU4Fbqyq7VW1A7gRWD3XuiRJoxvlyOE1wDTwN0nuSvKZJC8DXl1VjwO0z6Nb/2XAozPWn2ptu2uXJI3JKOGwBDgRuKyq3gT8Hz89hTSbzNJWe2jvN5BsSDKZZHJ6evr51itJGtIo4TAFTFXV7W3+GgZh8a12uoj2+cSM/sfOWH858Nge2jtVtamqVlbVyqVLl45QuiRpT+YcDlX1P8CjSV7fmlYB9wJbgV13HK0Drm3TW4Fz211LJwNPtdNONwCnJDmiXYg+pbVJksZkyYjrfwD4fJJDgQeB8xgEztVJ1gOPAOe0vtcDpwPbgKdbX6pqe5KLgDtavwuravuIdUmSRjBSOFTVN4CVsyxaNUvfAs7fzXY2A5tHqUWSNH98QlqS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEmdkcMhySFJ7krypTZ/XJLbkzyQ5Kokh7b2F7X5bW35xIxtfKS135/k1FFrkiSNZj6OHD4I3Ddj/mPApVW1AtgBrG/t64EdVfVa4NLWjyTHA2uBNwKrgU8lOWQe6pIkzdFI4ZBkOXAG8Jk2H+AdwDWtyxbgrDa9ps3Tlq9q/dcAV1bVD6rqIWAbcNIodUmSRjPqkcNfAr8H/KTNvwp4sqp2tvkpYFmbXgY8CtCWP9X6P9M+yzrPkmRDkskkk9PT0yOWLknanTmHQ5J3AU9U1Z0zm2fpWntZtqd1nt1YtamqVlbVyqVLlz6veiVJw1sywrpvBc5McjrwYuAwBkcShydZ0o4OlgOPtf5TwLHAVJIlwCuB7TPad5m5jiRpDOZ85FBVH6mq5VU1weCC8s1V9avALcDZrds64No2vbXN05bfXFXV2te2u5mOA1YAX5trXZKk0Y1y5LA7vw9cmeSjwF3A5a39cuBzSbYxOGJYC1BV9yS5GrgX2AmcX1U/3gd1SZKGNC/hUFW3Are26QeZ5W6jqvo+cM5u1r8YuHg+apEkjc4npCVJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJHcNBktQxHCRJnTmHQ5Jjk9yS5L4k9yT5YGs/MsmNSR5on0e09iT5RJJtSe5OcuKMba1r/R9Ism70YUmSRjHKkcNO4MNV9QbgZOD8JMcDFwA3VdUK4KY2D3AasKJ9bQAug0GYABuBNwMnARt3BYokaTzmHA5V9XhVfb1Nfw+4D1gGrAG2tG5bgLPa9Brgihq4DTg8yTHAqcCNVbW9qnYANwKr51qXJGl083LNIckE8CbgduDVVfU4DAIEOLp1WwY8OmO1qda2u3ZJ0piMHA5JXg78PfDbVfXdPXWdpa320D7bvjYkmUwyOT09/fyLlSQNZaRwSPJCBsHw+ar6Ymv+
VjtdRPt8orVPAcfOWH058Nge2jtVtamqVlbVyqVLl45SuiRpD0a5WynA5cB9VfUXMxZtBXbdcbQOuHZG+7ntrqWTgafaaacbgFOSHNEuRJ/S2iRJY7JkhHXfCvwa8O9JvtHa/gC4BLg6yXrgEeCctux64HRgG/A0cB5AVW1PchFwR+t3YVVtH6EuSdKI5hwOVfUvzH69AGDVLP0LOH8329oMbJ5rLZKk+eUT0pKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeqM8pyDJI3VxAXXjbuEA5ZHDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSer4ENwCGucDOw9fcsbY9i1p8fHIQZLUMRwkSR3DQZLUMRwkSR0vSEsamW9HPfB45CBJ6hgOkqSOp5UOEuM67Pf5CmlxMhykA4Tn/TWfDAdpnt324HdY6z/UWuQMB+1TB9tvs1e+5jvjLkGaF16QliR1DAdJUsdwkCR1DAdJUsdwkCR19ptwSLI6yf1JtiW5YNz1SNLBbL8IhySHAJ8ETgOOB96T5PjxViVJB6/9IhyAk4BtVfVgVf0QuBJYM+aaJOmgtb+EwzLg0RnzU61NkjQG+8sT0pmlrbpOyQZgQ5v93yT3z3F/RwHfnuO6i5VjXgBveWbqXQu525n8Ph/g8jFg7mP+uWE77i/hMAUcO2N+OfDYcztV1SZg06g7SzJZVStH3c5i4pgPDo754LAQY95fTivdAaxIclySQ4G1wNYx1yRJB6394sihqnYm+U3gBuAQYHNV3TPmsiTpoLVfhANAVV0PXL9Auxv51NQi5JgPDo754LDPx5yq7rqvJOkgt79cc5Ak7UcO2HDY2+s4krwoyVVt+e1JJha+yvk1xJh/J8m9Se5OclOSoW9r258N++qVJGcnqSSL/s6WYcac5N3t+31Pkr9d6Brn2xA/3z+b5JYkd7Wf8dPHUed8SbI5yRNJvrmb5UnyifbncXeSE+e1gKo64L4YXNT+T+A1wKHAvwHHP6fPbwCfbtNrgavGXfcCjPlXgJe26fcv9jEPO+7W7xXAV4DbgJXjrnsBvtcrgLuAI9r80eOuewHGvAl4f5s+Hnh43HWPOOa3AScC39zN8tOBLzN4Tuxk4Pb53P+BeuQwzOs41gBb2vQ1wKoksz2Mt1jsdcxVdUtVPd1mb2PwPMliN+yrVy4C/gT4/kIWt48MM+ZfBz5ZVTsAquqJBa5xvg0z5gIOa9OvZJZnpRaTqvoKsH0PXdYAV9TAbcDhSY6Zr/0fqOEwzOs4nulTVTuBp4BXLUh1+8bzfQXJega/dSx2ex13kjcBx1bVlxaysH1omO/164DXJfnXJLclWb1g1e0bw4z5j4H3JplicOfjBxamtLHZp68d2m9uZZ1nw7yOY6hXdiwiQ48nyXuBlcAv79OKFsYex53kBcClwPsWqqAFMMz3egmDU0tvZ3CE+M9JTqiqJ/dxbfvKMGN+D/DZqvrzJG8BPtfG/JN9X95Y7NN/ww7UI4dhXsfxTJ8kSxgchu7pEG5/N9QrSJK8E/hD4Myq+sEC1bYv7W3crwBOAG5N8jCDc7NbF/lF6WF/vq+tqh9V1UPA/QzCYrEaZszrgasBquqrwIsZvIPoQDXU3/m5OlDDYZjXcWwF1rXps4Gbq13lWaT2OuZ2euWvGATDYj8Hvcsex11VT1XVUVU1UVUTDK61nFlVk+Mpd14M8/P9DwxuQCDJUQxOMz24oFXOr2HG/AiwCiDJGxiEw/SCVrmwtgLntruWTgaeqqrH52vjB+RppdrN6ziSXAhMVtVW4HIGh53bGBwxrB1fxaMbcsx/Crwc+Lt27f2RqjpzbEXPgyHHfUAZcsw3AKckuRf4MfC7VfWd8VU9miHH/GHgr5N8iMHplfct5l/4knyBwWnBo9p1lI3ACwGq6tMMrqucDmwDngbOm9f9L+I/O0nSPnKgnlaSJI3AcJAkdQwHSVLHcJAkdQwHSVLHcJAkdQwHSVLH
cJAkdf4fInDb0s5Ci2IAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f3001f91198>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# distribution of predicted probabilities, with the 0.5 decision threshold marked in orange\n",
    "plt.hist(y_hat)\n",
    "_ = plt.axvline(x=0.5, color='orange')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'94.46'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# ROC AUC on the validation set, shown as a percentage with two decimals\n",
    "pct_auc = roc_auc_score(y_valid, y_hat)*100.0\n",
    "\"{:0.2f}\".format(pct_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
